import requests
import threading
import csv

# Zhihu v4 API endpoint for a member's followee list, assembled as
# url_1 + <url_token> + url_2 + <offset>. The `include` query requests
# follower_count / is_following etc.; `limit=20` gives 20 users per page.
url_1 = 'https://www.zhihu.com/api/v4/members/'
url_2 = '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&offset='

# Browser-like request headers. 'Cookie' is a placeholder: replace 'xxxx'
# with a real logged-in session cookie before running, or the API will
# reject the requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/people/zhang-jia-wei/following?page=2',
    'Cookie': 'xxxx'
}

# Shared crawl state, mutated from worker threads without a lock.
# NOTE(review): the check-then-append in crawl() is racy across threads;
# this relies on CPython's GIL making list.append atomic — confirm this
# is acceptable or add a threading.Lock.
to_crawl = ['crossin']  # frontier: url_tokens still to be expanded (seed user)
crawled = []            # url_tokens whose followee lists were already fetched
all_user = []           # collected rows [token, name, follower_count, is_following]
finished = threading.Event()  # signaled when a worker discovers a new token

def crawl(url):
    """Fetch one followee page and queue users with > 600k followers.

    Downloads *url* (one 20-entry page of the followee API), appends any
    sufficiently popular, not-yet-seen user to the shared ``to_crawl``
    frontier and ``all_user`` result list, and signals ``finished`` so the
    main loop wakes up. Runs both on the main thread (first page) and on
    worker threads (subsequent pages).

    Returns the page's ``paging`` dict (``{'totals': 0}`` if the response
    carried no paging info, e.g. an API error payload).
    """
    global to_crawl, crawled, finished
    # timeout: without it a stalled connection hangs the worker thread forever
    req = requests.get(url=url, headers=headers, timeout=10)
    data = req.json()

    # Error payloads (bad cookie, rate limit) have no 'data' key; fall back
    # to an empty page instead of killing the thread with a KeyError.
    for user in data.get('data', []):
        if user['follower_count'] > 600000:
            token = user['url_token']
            # NOTE(review): check-then-append on shared lists is not atomic
            # across threads; duplicates are possible under contention.
            if token not in to_crawl and token not in crawled:
                print(user['name'])
                to_crawl.append(token)
                all_user.append([token, user['name'], user['follower_count'], user['is_following']])
                print('add token', token)
                finished.set()  # wake the main loop waiting for new work
    return data.get('paging', {'totals': 0})

def get_following(user):
    """Fetch the full followee list of *user*, paging in parallel.

    The first page (offset 0) is requested synchronously to learn the
    total followee count; every remaining page is then fetched on its own
    thread via crawl(), up to a hard cap of offset 1000.
    """
    print('crawling', user)
    global to_crawl, crawled
    first_page = url_1 + user + url_2 + '0'

    paging = crawl(first_page)

    # One page per 20 followees; stop at the total or the 1000-offset cap,
    # whichever comes first.
    for offset in range(20, min(paging['totals'], 1000), 20):
        page_url = url_1 + user + url_2 + str(offset)
        threading.Thread(target=crawl, args=(page_url,)).start()

    print('to_crawl', to_crawl)
    print('crawled', crawled)

# Breadth-style walk of the follow graph: pop a token from the frontier,
# mark it crawled, then expand its followee pages (which may add tokens).
while to_crawl:
    user = to_crawl.pop()
    crawled.append(user)

    get_following(user)

    # Frontier drained but worker threads still running: wait for one of
    # them to signal a new token, polling every 3 seconds as a fallback.
    while not to_crawl and threading.active_count() > 1:
        print(to_crawl, crawled)
        print('wait', threading.active_count())
        finished.clear()
        finished.wait(3)

# Dump the collected users to CSV. newline='' is required by the csv module
# (otherwise every row is followed by a blank line on Windows), and an
# explicit utf-8 encoding keeps the Chinese header writable regardless of
# the platform's default locale.
with open('zhihuV.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['token','昵称','关注数','是否关注'])
    writer.writerows(all_user)
